[21]:
# Load all the libraries required for the display of the plots
import sys
import ast
import plotly
import plotly.graph_objs as go
import plotly.figure_factory as ff
import numpy as np
import pandas as pd
import networkx as nx

from math import log, ceil

sys.path.insert(0, '../') # required to import modules from the parent directory
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=False)
colors = plotly.colors.DEFAULT_PLOTLY_COLORS.copy()

Plot Cluster Size of a Single Cluster

Here, we will focus on displaying how the cluster size changes for the cluster that contains a single gene.

[22]:
# We need both the gene and cluster information for a series of readouts
readouts = {
    'Unweighted': '_unw_', # unweighted
    'Aβ40': '_wgt_ab40_',
    'Aβ42': '_wgt_ab42_',
    'sAPPα': '_wgt_sappa_',
    'sAPPβ': '_wgt_sappb_',
    'Viability': '_wgt_via_',
}
[23]:
# We will keep the information in two different DataFrames, one for Cluster Data and one for Gene Data
geneData = pd.DataFrame(columns=['GENEID'])
clusterData = pd.DataFrame(columns=['clu_id'])

for readout in readouts.keys():
    # merge information that says to which cluster each gene belongs
    geneData = geneData.merge(pd.read_csv('../results/chuang'+readouts[readout]+'gene.csv', usecols=['GENEID', 'clu_size', 'clu_id']).rename(columns={'clu_id':readout, 'clu_size': readout+'Size'}), how='outer', on='GENEID')
    # merge information that says what are the genes that belong to each cluster
    clusterData = clusterData.merge(pd.read_csv('../results/chuang'+readouts[readout]+'cluster.csv').rename(columns={'clu_size':readout+'Size', 'clu_genes':readout+'Genes'}), how='outer', on='clu_id')

geneData.set_index(keys='GENEID', inplace=True)
clusterData.set_index(keys='clu_id', inplace=True)
[24]:
# Select the target Gene
gene = int(input())

# These are the cluster IDs of the clusters containing the target gene in each of the
# different results
cluster_ids = geneData.loc[:,~geneData.columns.str.contains('.*Size', case=False)].loc[gene]
cluster_ids
[24]:
Unweighted    332
Aβ40           33
Aβ42           74
sAPPα          74
sAPPβ          34
Viability       0
Name: 351, dtype: int64
[25]:
# We add a column to the dataframe to identify the genes that are part of the hsa05010 pathway (Alzheimer's Disease)
df = pd.read_table('../data/KEGG_hsa05010.tsv', header=None)
df.columns=['GENEID']

geneData['inPathway'] = colors[0]
geneData.loc[geneData.index.isin(df.GENEID.values),'inPathway'] = colors[1]
[26]:
# Do a scatter plot of the cluster sizes
# Define the layout of the graph
fig = go.Figure(
    layout = dict(
        title = 'Size of cluster containing gene '+str(gene)+' per readout',
        yaxis_title = 'Cluster Size',
        xaxis_title = 'siRNA Readout',
        showlegend = True,
        plot_bgcolor = 'white',
        xaxis_gridcolor = 'lightGray',
        yaxis_linecolor = 'black',
        yaxis_gridcolor = 'lightGray',
        yaxis_zerolinecolor = 'lightGray',
    )
)

# Get a list of only the weighted readouts
wgt = (list(readouts.keys()))
wgt.remove('Unweighted')

# Add a horizontal line to trace the size of the unweighted cluster
fig.add_trace(go.Scatter(
    x = wgt,
    y = [geneData.at[gene, 'UnweightedSize']]*len(wgt),
    mode = 'lines+markers',
    line_color = colors[1],
    line_dash = 'dash',
    name = 'Unweighted Cluster Size',
))

# And a different trace trace for the size of the weighted clusters
fig.add_trace(go.Scatter(
    x = wgt,
    y = geneData.filter(regex='(?=.*Size$)').drop(columns=['UnweightedSize']).loc[gene].values,
    mode = 'lines+markers',
    line_color = colors[0],
    name = 'Weighted Cluster Size',
))

fig
[27]:
# Save the corresponding Image
fig.write_image(file='../images/chuang singleClusterSize APP.png', format='png')

Plot Clusters with Pathway highlight

Layout the clusters independently and use color to highlight those genes that are also part of the target pathway

[28]:
# Load all the edges of the network
edgeData = pd.read_table('../data/network1_chuang.txt', header=None)
edgeData.columns = ['x','y','xsymbol','ysymbol']
edgeData
[28]:
x y xsymbol ysymbol
0 6927 55281 HNF1A TMEM140
1 164 8906 AP1G1 AP1G2
2 3689 11082 ITGB2 ESM1
3 57326 151871 PBXIP1 DPPA2
4 5336 23236 PLCG2 PLCB1
... ... ... ... ...
56794 3172 55798 HNF4A METTL2B
56795 5595 5706 MAPK3 PSMC6
56796 6233 9491 RPS27A PSMF1
56797 6868 10459 ADAM17 MAD2L2
56798 2064 2885 ERBB2 GRB2

56799 rows × 4 columns

[29]:
# Add symbols to the DataFrame containing GeneData
df = pd.DataFrame()
df['geneid'] = pd.concat([edgeData['x'],edgeData['y']])
df['symbol'] = pd.concat([edgeData['xsymbol'],edgeData['ysymbol']])
df = df.drop_duplicates()

geneData['symbol'] = df.set_index('geneid')['symbol']
[30]:
# Select the first readout to draw and generate the position of its nodes
read1 = 'Unweighted'

nodes1 = ast.literal_eval(clusterData.at[cluster_ids[read1], read1+'Genes'])
edges1 = edgeData.loc[edgeData['x'].isin(nodes1)].loc[edgeData['y'].isin(nodes1)]

# Create a Networkx Graph with the edges of the network
e = []
edges1.apply(lambda x: e.append((x.x, x.y)), axis=1)
G = nx.Graph()
G.add_nodes_from(nodes1)
G.add_edges_from(e)

# Layout the nodes of the graph, using networkx layout algorithms or layout options
# available through the graphviz library ( prog is an enum from ['dot', 'neato', 'fdp', 'sfdp', 'twopi', 'circo'])
# CAUTION the positioning of the nodes can take several minutes depending on the algorithm used to define the layout]

# Layout using graphviz library
layout = nx.nx_agraph.graphviz_layout(G, prog='sfdp', args='-Gratio=auto -Goverlap=scale')
positions1 = pd.DataFrame.from_dict(layout, orient='index', columns = ['px','py'])
edges1 = edges1.merge(positions1, left_on='x', right_index=True, how='left')
edges1 = edges1.merge(positions1, left_on='y', right_index=True, how='left')
edges1['none'] = 'none'
/home/topeins/anaconda3/envs/mcl/lib/python3.6/site-packages/pygraphviz/agraph.py:1341: RuntimeWarning:

Error: remove_overlap: Graphviz not built with triangulation library


[31]:
# Trace that draws the edges of the first cluster
fig1 = go.Figure()
fig1.add_trace(go.Scatter(
            x = pd.concat([edges1['px_x'],edges1['px_y'],edges1['none']]).sort_index(kind='merge').values.tolist(),
            y = pd.concat([edges1['py_x'],edges1['py_y'],edges1['none']]).sort_index(kind='merge').values.tolist(),
            mode = 'lines',
            line_width = 0.5,
            line_color = '#888',
    ))

# Trace that draws the nodes of the first cluster
fig1.add_trace(go.Scatter(
    x = positions1['px'].values.tolist(),
    y = positions1['py'].values.tolist(),
    mode = 'markers+text',
    text = geneData.loc[nodes1,'symbol'].values.tolist(),
    textposition = 'bottom center',
    marker_size = 10,
    marker_color = geneData.loc[nodes1,'inPathway'].values.tolist(),
    ))
[32]:
# Do the same for a second readout to draw
read2 = 'Aβ42' # 'Aβ40', 'Aβ42', 'sAPPα', 'sAPPβ', 'Viability',

nodes2 = ast.literal_eval(clusterData.at[cluster_ids[read2], read2+'Genes'])
edges2 = edgeData.loc[edgeData['x'].isin(nodes2)].loc[edgeData['y'].isin(nodes2)]

# Create a Networkx Graph with the edges of the network
e = []
edges2.apply(lambda x: e.append((x.x, x.y)), axis=1)
G = nx.Graph()
G.add_nodes_from(nodes2)
G.add_edges_from(e)

# Layout the nodes of the graph, using networkx layout algorithms or layout options
# available through the graphviz library ( prog is an enum from ['dot', 'neato', 'fdp', 'sfdp', 'twopi', 'circo'])
# CAUTION the positioning of the nodes can take several minutes depending on the algorithm used to define the layout]

# Layout using graphviz library
layout = nx.nx_agraph.graphviz_layout(G, prog='sfdp', args='-Gratio=auto -Goverlap=scale')
positions2 = pd.DataFrame.from_dict(layout, orient='index', columns = ['px','py'])
edges2 = edges2.merge(positions2, left_on='x', right_index=True, how='left')
edges2 = edges2.merge(positions2, left_on='y', right_index=True, how='left')
edges2['none'] = 'none'
[33]:
# Trace that draws the edges of the first cluster
fig2 = go.Figure()
fig2.add_trace(go.Scatter(
            x = pd.concat([edges2['px_x'],edges2['px_y'],edges2['none']]).sort_index(kind='merge').values.tolist(),
            y = pd.concat([edges2['py_x'],edges2['py_y'],edges2['none']]).sort_index(kind='merge').values.tolist(),
            mode = 'lines',
            line_width = 0.5,
            line_color = '#888',
    ))

# Trace that draws the nodes of the first cluster
fig2.add_trace(go.Scatter(
    x = positions2['px'].values.tolist(),
    y = positions2['py'].values.tolist(),
    mode = 'markers+text',
    text = geneData.loc[nodes2,'symbol'].values.tolist(),
    textposition = 'bottom center',
    marker_size = 10,
    marker_color = geneData.loc[nodes2,'inPathway'].values.tolist(),
    ))
[34]:
# We plot the different clusters (one per readout) as a Figure with several sub-plots
fig = plotly.subplots.make_subplots(
  rows = 1,
  cols = 2,
  subplot_titles = ([read1, read2]),
  vertical_spacing = 0.05,
  horizontal_spacing = 0.05
)

fig.update_layout(
    title = 'Clusters containing Gene '+geneData.loc[gene,'symbol'],
    width = 1600,
  height = 800,
    plot_bgcolor = 'white',
    showlegend = False,
    margin = dict( t = 70, b = 50, r = 50, l = 50),
    xaxis_showticklabels = False,
    yaxis_showticklabels = False,
    xaxis2_showticklabels = False,
    yaxis2_showticklabels = False,
)

fig.add_trace(fig1.data[0],1,1)
fig.add_trace(fig1.data[1],1,1)
fig.add_trace(fig2.data[0],1,2)
fig.add_trace(fig2.data[1],1,2)
fig

Plot the Cluster’s change in Composition (as network)

Here we intend to show how the genes that make up the cluster containing a target gene change.

[36]:
# Add a new column to the genes DataFrame, where we will store the number of times
# any given gene is part of the cluster containing the target Gene (APP)
geneData['count'] = 0
# iterate through the clusters and modify the count value of the member genes
for i, cid in cluster_ids.items():
    c = ast.literal_eval(clusterData.at[cid, i+'Genes'])
    geneData.loc[c, 'count'] += 1

# Only the genes that are part of the cluster containing the target gene at least once
# are the ones with a count value >= than 1
geneData[geneData['count']>=1]

# Add a column to define the opacity used to draw any particular gene, function of its count number
geneData['opacity'] = geneData['count']/len(cluster_ids)
geneData
[36]:
UnweightedSize Unweighted Aβ40Size Aβ40 Aβ42Size Aβ42 sAPPαSize sAPPα sAPPβSize sAPPβ ViabilitySize Viability inPathway symbol count opacity
GENEID
6927 1876 0 175 0 215 0 1626 0 266 0 2689 0 rgb(31, 119, 180) HNF1A 1 0.166667
164 13 1 6 1 5 1 12 1 11 1 5 1 rgb(31, 119, 180) AP1G1 0 0.000000
3689 10 2 96 2 81 2 42 2 9 2 191 2 rgb(31, 119, 180) ITGB2 0 0.000000
57326 1876 0 1173 3 1444 3 1626 0 1488 3 2689 0 rgb(31, 119, 180) PBXIP1 1 0.166667
5336 88 3 46 4 41 4 9 3 27 4 41 3 rgb(31, 119, 180) PLCG2 0 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4808 2 1611 11 511 13 507 14 503 23 494 11 421 rgb(31, 119, 180) NHLH2 0 0.000000
6474 2 1658 2 1528 2 1472 2 1511 2 1454 2 1296 rgb(31, 119, 180) SHOX2 0 0.000000
10957 15 342 16 177 12 314 13 318 20 145 2689 0 rgb(31, 119, 180) PNRC1 1 0.166667
8780 1876 0 1173 3 1444 3 1626 0 1488 3 2689 0 rgb(31, 119, 180) RIOK3 1 0.166667
2193 1876 0 1 1599 1 1545 268 9 1 1528 211 8 rgb(31, 119, 180) FARSA 0 0.000000

10906 rows × 16 columns

[37]:
# Together with the individual genes, in order to display the clusters as a graph, we need to know the
# edges between the different nodes
edgeData = pd.read_table('../data/network1_chuang.txt', header=None)
edgeData.columns = ['x','y','xsymbol','ysymbol']

# filter out the edges where both end-points are not part of any of the clusters containing the target gene
genes = geneData[geneData['count']>=1].index.tolist()
edgeData = edgeData[edgeData.apply(lambda x: (x.x in genes) and (x.y in genes), axis=1)]
edgeData
[37]:
x y xsymbol ysymbol
0 6927 55281 HNF1A TMEM140
9 5985 5981 RFC5 RFC1
12 3175 80185 ONECUT1 TTI2
33 3172 50 HNF4A ACO2
49 164656 2335 TMPRSS6 FN1
... ... ... ... ...
56763 5721 7316 PSME2 UBC
56766 3172 2252 HNF4A FGF7
56776 5694 5695 PSMB6 PSMB7
56777 4191 5315 MDH2 PKM
56794 3172 55798 HNF4A METTL2B

6474 rows × 4 columns

[38]:
# Create a Networkx Graph with the edges of the network
edges = []
edgeData.apply(lambda x: edges.append((x.x, x.y)), axis=1)
G = nx.Graph()
G.add_edges_from(edges)

# Layout the nodes of the graph, using networkx layout algorithms or layout options
# available through the graphviz library ( prog is an enum from ['dot', 'neato', 'fdp', 'sfdp', 'twopi', 'circo'])
# CAUTION the positioning of the nodes can take several minutes depending on the algorithm used to define the layout

# Layout using graphviz library
# layout = nx.nx_agraph.graphviz_layout(G, prog='dot', args='-Gsize="10,10" -Gratio=fill -Gnodesep=0.1 -Granksep=0.1 -Grankdir=BT')

# Layout using force-directed algorithm implemented in networkx
# layout = nx.kamada_kawai_layout(G)#,scale=2)

# Layout using the shell algorithm implemented in networkx
shells = []
# max number of clusters to which a gene can belong
m = geneData[['count']].max()[0]
for i in range(m,0,-1):
    # if the amount of genes in any given shell is too large, split it into several shells
    s = geneData[geneData['count']== i].index.tolist()
    # max number of genes in the current shell
    while len(s) > 0:
        n = 50*len(shells)
        if len(s) > n:
            shells.append(s[:n])
            del s[:n]
        else:
            shells.append(s)
            s = []
layout = nx.shell_layout(G, shells, scale=len(shells))
print(len(shells))
# layout
13
[39]:
# Define a DataFrame for the list of node positions, and merge this data into the
# geneData DataFrame
layoutData = pd.DataFrame.from_dict(layout, orient='index', columns = ['px','py'])
geneData = geneData.merge(layoutData, how = 'left', right_index = True, left_index = True)
geneData

layoutData.max()
[39]:
px    13.000000
py    12.995393
dtype: float64
[40]:
# For convenience, also merge the positional information of the nodes to the edgeData DataFrame
edgeData = edgeData.merge(layoutData[['px','py']], how = 'left', right_index=True, left_on='x')
edgeData = edgeData.merge(layoutData[['px','py']], how = 'left', right_index=True, left_on='y')
edgeData['none'] = None
edgeData
[40]:
x y xsymbol ysymbol px_x py_x px_y py_y none
0 6927 55281 HNF1A TMEM140 -0.843757 6.948962 -10.303911 3.850899 None
9 5985 5981 RFC5 RFC1 -1.134009 6.907534 -3.401885 -6.117775 None
12 3175 80185 ONECUT1 TTI2 -1.278421 6.882270 -10.397429 3.590745 None
33 3172 50 HNF4A ACO2 -1.849830 6.751158 -10.441730 3.459807 None
49 164656 2335 TMPRSS6 FN1 0.723220 5.956253 -1.647186 5.769469 None
... ... ... ... ... ... ... ... ... ...
56763 5721 7316 PSME2 UBC -1.990811 6.710937 6.187679 -3.273016 None
56766 3172 2252 HNF4A FGF7 -1.849830 6.751158 -8.489806 2.987171 None
56776 5694 5695 PSMB6 PSMB7 6.853987 1.422275 -2.101660 -7.719004 None
56777 4191 5315 MDH2 PKM 3.597208 -6.005006 2.970659 -7.428000 None
56794 3172 55798 HNF4A METTL2B -1.849830 6.751158 4.935251 0.802056 None

6474 rows × 9 columns

[41]:
# We plot the different clusters (one per readout) as a Figure with several sub-plots
fig = plotly.subplots.make_subplots(
    rows = 3, # there are 5 subplots, so we define a grid of 3x2
    cols = 2,
    shared_xaxes = True,
    shared_yaxes = True,
    subplot_titles = (['subtitle']*len(cluster_ids)),
    vertical_spacing = 0.05,
    horizontal_spacing = 0.05,
)
# initial sub-plot data
i = 1
j = 1
a = 'a'

# We add two traces for each cluster, the first one draws only the edges of the cluster
# whilst the second draws the nodes
for readout, id in cluster_ids.items():
    # Generate the data required for the traces
    # Get the list of genes in the current cluster
    cluster = ast.literal_eval(clusterData.at[id, readout+'Genes'])

    # Define a DataFrame that contains only the interactions that are within the cluster
    df = edgeData[edgeData.apply(lambda x: (x.x in cluster) and (x.y in cluster), axis=1)]
    # Trace that draws the corresponding edges
    fig.append_trace(
        go.Scatter(
            x = pd.concat([df['px_x'],df['px_y'],df['none']]).sort_index(kind='merge').values.tolist(),
            y = pd.concat([df['py_x'],df['py_y'],df['none']]).sort_index(kind='merge').values.tolist(),
            mode = 'lines',
            line_width = 0.5,
            line_color = '#888',
            showlegend = False,
        ),i,j,
    )

    # Define a DataFrame that contains only the nodes that are within the cluster
    df = geneData.loc[cluster]
    # Trace that draws the nodes
    fig.append_trace(
        go.Scatter(
            x = df['px'].values.tolist(),
            y = df['py'].values.tolist(),
            mode = 'markers',
            textposition = 'bottom center',
            marker = dict(
                color = colors[0],
                opacity = df['opacity'].tolist(),
                colorscale = plotly.colors.sequential.YlOrRd,
            ),
            showlegend = False,
        ), i,j,
    )

    # Add the subtitle for the current sub-plot
    fig['layout']['annotations'][(2*i+j)-3]['text'] = '('+a+') '+readout

    # update the positioning of the next plot
    j += 1
    a = chr(ord(a)+1)
    if j > 2:
        i += 1
        j = 1

fig.update_layout(
    title = 'Clusters containing Gene '+str(gene)+' per weighting scheme',
    showlegend = False,
    width = 1000,
    height = 1000,
    margin = dict( t = 70, b = 50, r = 50, l = 50),
    plot_bgcolor = 'white',
)

fig.update_xaxes(
    showticklabels = False,
)

fig.update_yaxes(
    showticklabels = False,
    range = [-15,15],
)

# Add a color scale to the figure
n = geneData['count'].max()
scale = [[0, 'rgba(31, 119, 180, '+str(round(1/n,2))+')']]
for i in range(1,n):
    scale.append([i/n, 'rgba(31, 119, 180, '+str(round(i/n,2))+')'])
    scale.append([(i+1)/n, 'rgba(31, 119, 180, '+str(round((i+1)/n,2))+')'])

# scale
colorbar_trace = go.Scatter(
    x=[None],
    y=[None],
    mode='markers',
    hoverinfo='none',
    marker=dict(
        colorscale = scale,
        cmin = 0,
        cmax = n,
        colorbar = dict(
            outlinewidth = 1,
            title= 'Intersections',
            ticks= 'inside',
            tickwidth = 2,
            ticklen = 30,
            tickmode = 'array',
            tickvals = [ i for i in range(n+1)],
        )
    ),
)

fig.add_trace(colorbar_trace)

fig
[42]:
# Save the corresponding Image
fig.write_image(file='../images/chuang clusters containing APP.png', format='png')

Heatmap the Genes in different readouts that belong to a Pathway

Here we intend to show how the overlap between the cluster and some pre-defined pathways.

[43]:
pathways = pd.DataFrame()
pathways['hsa05010'] = pd.read_csv('../data/KEGG_hsa05010.tsv', header=None)[0]
pathways.set_index('hsa05010', inplace=True)

pathways = pathways.merge( right=(geneData['Unweighted']==cluster_ids['Unweighted']), how='left', right_index=True, left_index=True)
pathways = pathways.merge( right=(geneData['Aβ40']==cluster_ids['Aβ40']), how='left', right_index=True, left_index=True)
pathways = pathways.merge( right=(geneData['Aβ42']==cluster_ids['Aβ42']), how='left', right_index=True, left_index=True)
pathways = pathways.merge( right=(geneData['sAPPα']==cluster_ids['sAPPα']), how='left', right_index=True, left_index=True)
pathways = pathways.merge( right=(geneData['sAPPβ']==cluster_ids['sAPPβ']), how='left', right_index=True, left_index=True)

symbols = pd.read_table('../data/network1_chuang.txt',header=None)
symbols.columns=['x','y','xsym','ysym']
s1 = pd.Series(data=symbols.xsym.values.tolist(), index=symbols.x.values.tolist())
s2 = pd.Series(data=symbols.ysym.values.tolist(), index=symbols.y.values.tolist())
s = s1.append(s2).drop_duplicates()

pathways = pathways.merge( right=pd.Series(s, name='symbol'), how='left', right_index=True, left_index=True)

pathways.dropna(inplace=True)
pathways.set_index(['symbol'], append=True,inplace=True)
pathways
[43]:
Unweighted Aβ40 Aβ42 sAPPα sAPPβ
hsa05010 symbol
102 ADAM10 False False False False False
6868 ADAM17 False False False False False
351 APP True True True True True
8883 NAE1 False True True False False
322 APBB1 True True True True True
... ... ... ... ... ... ...
29982 NRBF2 False False False False False
5289 PIK3C3 False False False False False
55102 ATG2B False False False False False
26100 WIPI2 False False False False False
55062 WIPI1 False False False False False

310 rows × 5 columns

[44]:
df = pathways.loc[pathways.any(axis=1)]
fig = go.Figure(
    layout = dict(
        title = 'Genes in APP Cluster that belong to hsa05010',
        yaxis_autorange = 'reversed',
        width = 600,
    )
)

colorscale = [
    [0, 'white'],#white
    [0.5, 'white'],#white
    [0.5, colors[0]],#white
    [1, colors[0]]
]

fig.add_trace(
    go.Heatmap(
        z = df.values.astype('int64').tolist(),
        x = df.columns.values.tolist(),
        y = df.index.get_level_values(1).values.tolist(),
        xgap= 2,
        ygap= 2,
        colorscale = colorscale,
        colorbar = dict(
            thickness  =25,
            tickvals = [0.25, 0.75],
            ticktext = ['False', 'True'],
            outlinewidth=1,
        )
    )
)

fig